library(tidyverse)
library(dplyr)
library(maps)
library(spBayes)
library(viridis)
########
# Datasets description
# PA timestamp is recorded in GMT
# EPA timestamp is recorded in both GMT and EST
# However, the time difference between GMT and EST is always the same (5 hr)
# Which means that EPA timestamp may not account for the daytime light saving
# The latest EPA update is 08/31/2020
setwd("/Users/hongjianyang/Research/PA/")
pa2018 <- read.csv('Data/NC_PA_FRM_PM/PA_2018_hourly_PM_NC.csv')
# Subset data
col <- c('Lon', 'Lat', 'Timestamp', 'PM25_corrected')
pa <- pa2018 %>%
  select(col) %>%
  rename(PM25 = PM25_corrected) %>%
  group_by(Lon, Lat, Timestamp) %>%
  summarise(PM25 = mean(PM25))

pa <- pa[order(pa$Lon, pa$Lat, pa$Timestamp), ]
# Remove NAs
pa <- pa[complete.cases(pa), ]

# Get 12/21/2018 - 12/21/2018 data
pa$Timestamp <- as.POSIXct(pa$Timestamp)
startTime <- as.POSIXct('2018-12-21 00:00:00')
endTime <- as.POSIXct('2018-12-21 23:59:59')
pa <- subset(pa, Timestamp <= endTime)
pa <- subset(pa, Timestamp >= startTime)

start <- as.numeric(startTime)
end <- as.numeric(as.POSIXct('2018-12-21 23:00:00'))
diff <- end - start

# Prediction locations
# Longitude first, then latitude
s01   <- seq(-90,-65,0.1) # Lon
s02   <- seq(30,38,0.1) # Lat
s0    <- as.matrix(expand.grid(s01,s02))
innc <- map.where(map('state', 'north carolina', fill = TRUE), s0[,1], s0[,2])

s0    <- s0[!is.na(innc),]

X0     <- cbind(1,s0)
X0[,2] <- (X0[,2]-mean(X0[, 2])) / sd(X0[,2])
X0[,3] <- (X0[,3]-mean(X0[,3])) / sd(X0[,3])

lon <- X0[, 2]
lat <- X0[, 3]
lon2 <- lon * lon
lat2 <- lat * lat
lonlat <- lon * lat

X0 <- cbind(1, lat, lon, lat2, lon2, lonlat)

## Bayesian Kriging set up
library(spBayes)
library(viridis)
n.samples <- 25000
burn      <- 5000

n_timestamp <- diff / 3600 + 1
one_var <- vector(length = n_timestamp)
for (i in 0:(n_timestamp - 1)) {
  current <- as.POSIXct(3600 * i, origin = startTime)
  subdf <- subset(pa, Timestamp == current)
  S <- data.matrix(subdf[, 1:2])
  maxd      <- max(dist(S))
  
  # Construct locations
  X <- data.frame(cbind(1, S))
  X[, 2] <- (X[, 2] - mean(X[, 2])) / sd(X[, 2])
  X[, 3] <- (X[, 3] - mean(X[, 3])) / sd(X[, 3])
  lon <- X[, 2]
  lat <- X[, 3]
  lon2 <- lon * lon
  lat2 <- lat * lat
  lonlat <- lon * lat
  X <- cbind(1, lon, lat, lon2, lat2, lonlat)
  
  print(dim(df))
  # Kriging
  Y <- subdf$PM25
  
  starting  <- list("phi"=1/(0.05*maxd), "sigma.sq"=0.5*var(Y), "tau.sq"=0.5*var(Y))
  tuning    <- list("phi"=1, "sigma.sq"=0.05*var(Y), "tau.sq"=0.05*var(Y))
  amcmc     <- list("n.batch"=n.samples/100, "batch.length"=100, "accept.rate"=0.43)
  priors    <- list("beta.Norm"=list(rep(0,6), 100*diag(6)),
                    "phi.Unif"=c(1/(2*maxd), 1/(0.01*maxd)),
                    "sigma.sq.IG"=c(2, 1),
                    "tau.sq.IG"=c(2, 1))
  
  fit_spbayes  <- spLM(Y~X-1, coords=S,
                       starting=starting, tuning=tuning, 
                       priors=priors, cov.model="exponential",
                       amcmc=amcmc, n.samples=n.samples,verbose=FALSE)
  
  pred <- spPredict(fit_spbayes, pred.coords=s0, pred.covars=X0, 
                    start=burn, thin=10, verbose=FALSE)
  pred <- pred$p.y.predictive.samples
  
  Yhat  <- apply(pred,1,mean)
  Ysd   <- apply(pred,1,sd)
  
  df <- data.frame(long=s0[,1],lat=s0[,2],Y=Ysd)
  # Samples near RTP area
  # Longitude range: -78.89, -78.52; Latitude range: 35.71, 35.92
  rtp_sample <- subset(df, df$long <=-78.52 & df$long >= -78.89 & df$lat >= 35.71
                       & df$lat <= 35.92)
  rtp_std <- mean(rtp_sample$Y)
  one_var[i + 1] = rtp_std
  # Plot standard deviation
  pred_var <- ggplot(df, aes(long, lat)) +
    geom_polygon(aes(x=long, y=lat)) +
    geom_raster(aes(fill = Y)) +
    scale_fill_gradientn(colours = viridis(10))+
    xlab(current)+ylab("")+labs(title="Posterior predictive standard deviation")+
    coord_fixed()
  plot(pred_var)
}
## NULL

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

map <- map_data("county")
nc <- subset(map, region =="north carolina")
R <- ggplot(data=nc) + 
  geom_polygon(aes(x=long, y=lat, group = group)) +
  geom_point(data = subdf, aes(Lon, Lat), colour = 'purple', size = 0.5) +
  coord_fixed()
R

###########################################################################
###########################################################################
###########################################################################
# 2020 Data
# Explore variance between 07/01/2020 - 07/07/2020
# 29 epa stations, 89 purple air stations (07/01)
###########################################################################
###########################################################################
###########################################################################
pa2020 <- read.csv('Data/NC_PA_FRM_PM/PA_2020_hourly_PM_NC.csv')
epa2020 <- read.csv('Data/NC_PA_FRM_PM/FRM_2020_hourly_PM_NC.csv')
# Subset data
col <- c('Lon', 'Lat', 'Timestamp', 'PM25_corrected')
pa <- pa2020 %>%
  select(col) %>%
  rename(PM25 = PM25_corrected) %>%
  group_by(Lon, Lat, Timestamp) %>%
  summarise(PM25 = mean(PM25))

pa <- pa[order(pa$Lon, pa$Lat, pa$Timestamp), ]
# Remove NAs
pa <- pa[complete.cases(pa), ]

# Process EPA data
epa2020$Timestamp <- paste(epa2020$Date_GMT, epa2020$Hour_GMT)
epa2020$Timestamp <- paste(epa2020$Timestamp, ':00', sep = '')
col <- c('Lon', 'Lat', 'Timestamp', 'PM25')
epa <- epa2020 %>% 
  select(col)
epa <- epa[order(epa$Lon, epa$Lat, epa$Timestamp), ]

# Get 07/01/2020 - 07/07/2020 data
pa$Timestamp <- as.POSIXct(pa$Timestamp)
epa$Timestamp <- as.POSIXct(epa$Timestamp, format = '%Y-%m-%d %H:%M:%OS')
startTime <- as.POSIXct('2020-07-01 00:00:00')
endTime <- as.POSIXct('2020-07-07 23:59:59')
pa <- subset(pa, Timestamp <= endTime & Timestamp >= startTime)
epa <- subset(epa, Timestamp <= endTime & Timestamp >= startTime)

start <- as.numeric(startTime)
end <- as.numeric(as.POSIXct('2020-07-07 23:00:00'))
diff <- end - start

epa$indicator = 1
pa$indicator = 0

df_combine <- rbind(epa, pa)


X0 <- cbind(X0, 0)

n_timestamp <- diff / 3600 + 1
one_var <- vector(length = n_timestamp)

# Model: Y = \beta_{0} + \beta_{1}*Lon + \beta_{2}*Lat + \beta_{3}*Lon^2 + 
# \beta_{4}*Lat^2 + \beta_{5}*LonLat + \beta_{6}I(epa)
for (i in 0:(n_timestamp - 1)) {

  current <- as.POSIXct(3600 * i, origin = startTime)

  
  subdf <- subset(df_combine, Timestamp == current)
  S <- data.matrix(subdf[, 1:2])
  maxd      <- max(dist(S))
  
  # Construct locations
  X <- data.frame(cbind(1, S))
  X[, 2] <- (X[, 2] - mean(X[, 2])) / sd(X[, 2])
  X[, 3] <- (X[, 3] - mean(X[, 3])) / sd(X[, 3])
  lon <- X[, 2]
  lat <- X[, 3]
  lon2 <- lon * lon
  lat2 <- lat * lat
  lonlat <- lon * lat
  X <- cbind(1, lon, lat, lon2, lat2, lonlat, subdf$indicator)
  
  print(dim(df))
  # Kriging
  Y <- subdf$PM25
  
  starting  <- list("phi"=1/(0.05*maxd), "sigma.sq"=0.5*var(Y), "tau.sq"=0.5*var(Y))
  tuning    <- list("phi"=1, "sigma.sq"=0.05*var(Y), "tau.sq"=0.05*var(Y))
  amcmc     <- list("n.batch"=n.samples/100, "batch.length"=100, "accept.rate"=0.43)
  priors    <- list("beta.Norm"=list(rep(0,7), 100*diag(7)),
                    "phi.Unif"=c(1/(2*maxd), 1/(0.01*maxd)),
                    "sigma.sq.IG"=c(2, 1),
                    "tau.sq.IG"=c(2, 1))
  
  fit_spbayes  <- spLM(Y~X-1, coords=S,
                       starting=starting, tuning=tuning, 
                       priors=priors, cov.model="exponential",
                       amcmc=amcmc, n.samples=n.samples,verbose=FALSE)
  
  pred <- spPredict(fit_spbayes, pred.coords=s0, pred.covars=X0, 
                    start=burn, thin=10, verbose=FALSE)
  
  pred <- pred$p.y.predictive.samples
  
  Yhat  <- apply(pred,1,mean)
  Ysd   <- apply(pred,1,sd)
  
  df <- data.frame(long=s0[,1],lat=s0[,2],Y=Ysd)
  # Samples near RTP area
  # Longitude range: -78.89, -78.52; Latitude range: 35.71, 35.92
  rtp_sample <- subset(df, df$long <=-78.52 & df$long >= -78.89 & df$lat >= 35.71
                       & df$lat <= 35.92)
  rtp_std <- mean(rtp_sample$Y)
  one_var[i + 1] = rtp_std
  # Plot standard deviation
  pred_var <- ggplot(df, aes(long, lat)) +
    geom_polygon(aes(x=long, y=lat)) +
    geom_raster(aes(fill = Y)) +
    scale_fill_gradientn(colours = viridis(10))+
    xlab(current)+ylab("")+labs(title="Posterior predictive standard deviation")+
    coord_fixed()
  plot(pred_var)
}
## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

## [1] 1264    3

map <- map_data("county")
nc <- subset(map, region =="north carolina")
R <- ggplot(data=nc) + 
  geom_polygon(aes(x=long, y=lat, group = group)) +
  geom_point(data = subdf, aes(Lon, Lat), colour = 'purple', size = 0.5) +
  coord_fixed()
R

one_var
##   [1]  1.9052051  1.5893965  1.5114837  1.6707842  1.4343314  1.6064139
##   [7]  1.4748075  1.3460338  1.4606207  1.3099679  1.4841957  5.0005923
##  [13]  0.9644108  1.3168099  3.0002070  1.9856533  1.4075152  1.1871802
##  [19]  2.3033336  1.2084476  1.4426470  3.5227921  1.7692592  1.8061895
##  [25]  1.5714654  1.6002801  1.6811134  1.5773866  1.7422996  1.7006562
##  [31]  1.3775725  1.3680586  1.6216852  1.1399162  1.2727918  1.1216875
##  [37]  1.2117824  1.4776425  1.5733690  1.0785898  0.8406213  1.0027079
##  [43]  1.3518229  1.3199309  1.2229709  1.3069607  1.4428337  1.3916583
##  [49]  1.5774606  3.4798832  2.7693829  3.3271570  2.6801845  3.0497978
##  [55]  2.8179059  2.7081117  2.8899521  2.0161804  1.9808458  1.8333730
##  [61]  2.0168364  1.7802773  1.6040024  1.4351711  0.9588706  1.8628268
##  [67]  1.3064317  2.1206894  1.4795010  1.3306186 14.5551906  7.6889506
##  [73]  1.9315503  3.2577544  3.5552235  4.2886842  4.4615923  5.3810691
##  [79]  4.0807105  3.6968467  3.0630148  2.9682985  2.8998619  2.4043797
##  [85]  2.4129995  2.6090009  2.3764793  2.6860665  2.2573893  1.7885452
##  [91]  2.8479019  1.8238429  2.3087738  2.1931886  2.7035691  3.1249869
##  [97]  2.8051889 16.4519602 25.0294728 19.9644258 16.6320573 16.7146707
## [103] 12.1522414 11.6501266  9.1198348  6.5903549  4.5559061  3.6969860
## [109]  2.9997822  2.5020343  2.3892037  2.6375189  2.1300700  1.8588336
## [115]  2.6904052  2.5550291  1.7681094  1.8333074  4.3819418  2.4056640
## [121]  1.9947108  3.8649705  2.5167407  1.9478483  2.0165614  1.8805844
## [127]  1.8095385  2.0423607  1.8088296  2.3581922  1.7220284  2.4644528
## [133]  1.7973765  1.7919945  2.2592240  1.1511978  1.9754137  1.7684362
## [139]  4.2892584  1.1722152  1.5803833  1.2530491  2.0851733  2.1094615
## [145]  1.5477366  1.8205729  2.1500770  1.5220204  2.1010737  1.5288082
## [151]  1.4488927  1.6211477  1.7482352  1.6639982  2.1853334  2.2754807
## [157]  2.0504857  1.7761231  1.6097263  1.6406558  1.1991648  1.6740114
## [163]  1.3819978  3.3999477  1.1871686  1.2596185  0.7915277  0.8660867
for (i in 1:7) {
  plot(one_var[(24*(i-1)+1):((24*i))], type = 'b')
}

plot(one_var, type = 'b')